Packages and functions¶
In [1]:
import os
import umap
import time
import random
# import anndata
# import argparse
import numpy as np
import pandas as pd
import scanpy as sc
import editdistance
import colorsys as cs
import seaborn as sns
# import memory_profiler
import scipy.sparse as sp
from collections import Counter
import matplotlib.pyplot as plt
from umi_tools import UMIClusterer
from scipy.spatial import procrustes
# from sklearn.decomposition import PCA
# from scipy.spatial.distance import cdist
# from matplotlib.cm import ScalarMappable
# from sklearn.neighbors import NearestNeighbors
from scipy.linalg import orthogonal_procrustes
# from matplotlib.colors import LinearSegmentedColormap, Normalize
# def make_colormap(seq):
# seq = [(None,) * 3, 0.0] + list(seq) + [1.0, (None,) * 3]
# cdict = {'red': [], 'green': [], 'blue': []}
# for i, item in enumerate(seq):
# if isinstance(item, float):
# r1, g1, b1 = seq[i - 1]
# r2, g2, b2 = seq[i + 1]
# cdict['red'].append([item, r1, r2])
# cdict['green'].append([item, g1, g2])
# cdict['blue'].append([item, b1, b2])
# return LinearSegmentedColormap('CustomMap', cdict)
def build_6mer_dist(bc_list):
start_km = {}
mid_km = {}
end_km = {}
for bc in bc_list:
start_km.setdefault(bc[:6] , []).append(bc)
mid_km.setdefault(bc[4:10], []).append(bc)
end_km.setdefault(bc[-6:] , []).append(bc)
return start_km,mid_km,end_km
def barcode_matching(bc_pos_dict,spatial_bc_list,max_dist=1):
bc_matching_dict = {}
def get_sel_bc(bc):
res = []
if bc[:6] in start_km:
res += start_km[bc[:6]]
if bc[-6:] in end_km:
res += end_km[bc[-6:]]
if bc[4:10] in mid_km:
res += mid_km[bc[4:10]]
return set(res)
exact_match = 0
fuzzy_match = 0
bc_ref_list = list(bc_pos_dict.keys())
start_km,mid_km,end_km = build_6mer_dist(bc_ref_list)
for bc in spatial_bc_list:
bc_old = bc
#bc = "".join([bc[it] for it in [1,2,3,4,5,6,7,9,10,11,12,13] ])
if bc in bc_pos_dict:
exact_match += 1
bc_matching_dict[bc_old] = bc
else:
sel_bc = get_sel_bc(bc)
#sel_bc = bc_ref_list
if len(sel_bc)>0:
fz = [(it, editdistance.eval(it, bc)) for it in sel_bc]
fz = [it for it in fz if it[1]<=max_dist]
fz.sort(key=lambda x:x[1])
if len(fz)==0:
continue
## if there are two barcodes with the same edit distance, choose the one with higher error rate in the last base
if len(fz)>1 and fz[0][1]==fz[1][1]:
if editdistance.eval(fz[0][0][:-1], bc[-1])>editdistance.eval(fz[1][0][:-1], bc[-1]): # higher error rate in the last base of the barcode
fuzzy_match += 1
bc_matching_dict[bc_old] = fz[1][0]
elif editdistance.eval(fz[0][0][:-1], bc[-1])<editdistance.eval(fz[1][0][:-1], bc[-1]):
fuzzy_match += 1
bc_matching_dict[bc_old] = fz[0][0]
else:
fuzzy_match += 1
bc_matching_dict[bc_old] = fz[0][0]
return bc_matching_dict,exact_match,fuzzy_match
def umi_collapsing(cnt_dict, max_dist=1):
"""
input: dict of barcode without matching
output: list of barcode after collapsing
"""
start_time = time.time()
clusterer = UMIClusterer(cluster_method="directional")
clustered_bc = clusterer(cnt_dict, threshold=max_dist)
clustering_time = time.time()
cluster_bc = [bc_group[0].decode('utf-8') for bc_group in clustered_bc]
end_time = time.time()
print("Clustering time: {}s".format(clustering_time-start_time))
print("Dict creation time is: {}s".format(end_time-clustering_time))
print("Total time is: {}s".format(end_time-start_time))
return cluster_bc
# def blind_cnt_distribution(bead_all, bead_type):
# plt.figure(figsize=(8,6))
# sns.histplot(np.log10(bead_all['total_cnt']), bins=50)
# plt.xlabel('log10(total count)', fontsize=12)
# plt.ylabel('number of '+bead_type, fontsize=12)
# plt.title('blind '+bead_type+' total count distribution ({}), median={}'.format(len(bead_all), bead_all['total_cnt'].median()), fontsize=16, pad=20)
# plt.tick_params(axis='both', which='major', labelsize=10)
# plt.gca().spines['top'].set_visible(False)
# plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join(out_dir,bead_type+'_blind_cnt_distribution.png'),dpi=300)
# plt.close()
# def blind_cover_bc_distribution(bead_cover_bc, bead_type):
# plt.figure(figsize=(8,6))
# sns.histplot(bead_cover_bc['cnt'], bins=50)
# plt.xlabel('bead covered', fontsize=12)
# plt.ylabel('number of '+bead_type, fontsize=12)
# plt.title(bead_type+' bead covered distribution ({}), median={}'.format(len(bead_cover_bc), bead_cover_bc['cnt'].median()), fontsize=16, pad=20)
# plt.tick_params(axis='both', which='major', labelsize=10)
# plt.gca().spines['top'].set_visible(False)
# plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join(out_dir,bead_type+'_blind_cover_bc_distribution.png'),dpi=300)
# plt.close()
# generate spase matrix from matching, with selection on anchor and target
def get_matrix(match_df,min_a_cnt,max_a_cnt, min_t_cnt,max_t_cnt, anchor, target):
a_all = match_df.groupby(anchor)['cnt'].sum().reset_index(name='total_cnt')
a_sel = a_all.loc[(a_all['total_cnt']>min_a_cnt) & (a_all['total_cnt']<max_a_cnt),]
t_all = match_df.groupby(target)['cnt'].sum().reset_index(name='total_cnt')
t_sel = t_all.loc[(t_all['total_cnt']>min_t_cnt) & (t_all['total_cnt']<max_t_cnt),]
match_df = match_df[(match_df[anchor].isin(a_sel[anchor])) & (match_df[target].isin(t_sel[target]))]
a_list = match_df.groupby(anchor)['cnt'].sum().reset_index(name='total_cnt')
t_list = match_df.groupby(target)['cnt'].sum().reset_index(name='total_cnt')
print('a: {}'.format(len(a_list)))
print('t: {}'.format(len(t_list)))
a_dict = dict()
t_dict = dict()
for i in range(len(a_list)):
a_dict[a_list.iloc[i,0]] = i
for j in range(len(t_list)):
t_dict[t_list.iloc[j,0]] = j
a_coo = []
t_coo = []
[a_coo.append(a_dict[a]) for a in match_df[anchor]]
[t_coo.append(t_dict[t]) for t in match_df[target]]
counts_coo = sp.coo_matrix((match_df['cnt'], (a_coo, t_coo)))
# counts = counts_coo.tocsr().toarray()
counts = counts_coo.tocsr()
return counts, a_list, t_list
def get_col(coords):
plot_df = coords.copy()
# b_list = [1] * len(plot_df)
# r_list = (plot_df.xcoord.values - plot_df.xcoord.min()) / (
# plot_df.xcoord.max() - plot_df.xcoord.min())
# g_list = (plot_df.ycoord.values - plot_df.ycoord.min()) / (
# plot_df.ycoord.max() - plot_df.ycoord.min())
my_h = plot_df['xcoord']
my_h = (my_h - np.min(my_h))
my_h = 0.2+my_h * (0.8 / np.max(my_h))
my_s = np.ones(len(my_h)) * 0.5
my_l = plot_df['ycoord']
my_l = (my_l - np.min(my_l))
my_l = 0.2 + my_l * (0.7 / np.max(my_l))
hls_color = np.column_stack((my_h, my_l, my_s))
rgb_color = [cs.hls_to_rgb(p[0], p[1], p[2]) for p in hls_color]
# rgb_color = [(r_list[i], g_list[i], b_list[i]) for i in range(len(plot_df))]
return rgb_color
def get_truth(wlist,pos_df):
w_truth = pos_df[pos_df['barcode'].isin(wlist)]
w_truth = w_truth.sort_values(by=['barcode'])
w_col = get_col(w_truth)
return w_truth, w_col
def get_pa(pos_truth,pos_recon):
mtx1, mtx2, disparity = procrustes(pos_truth[['xcoord','ycoord']], pos_recon[['xcoord','ycoord']])
pos_truth_translate = pos_truth.copy()
pos_truth_translate['xcoord'] = pos_truth_translate['xcoord'] - np.mean(pos_truth_translate['xcoord'])
pos_truth_translate['ycoord'] = pos_truth_translate['ycoord'] - np.mean(pos_truth_translate['ycoord'])
scaling = np.sqrt(np.trace(np.dot(pos_truth_translate[['xcoord','ycoord']], pos_truth_translate[['xcoord','ycoord']].T)))
scaling_2 = np.sqrt(np.trace(np.dot(pos_truth_translate[['xcoord','ycoord']], pos_truth_translate[['xcoord','ycoord']].T)))/np.sqrt(np.trace(np.dot(mtx2, mtx2.T)))
mtx1_scaled = mtx1*scaling
mtx2_scaled = mtx2*scaling_2
dist = pos_truth_translate.copy()
dist['x'] = mtx1_scaled[:,0] - mtx2_scaled[:,0]
dist['y'] = mtx1_scaled[:,1] - mtx2_scaled[:,1]
dist['r'] = np.sqrt(dist['x']**2 + dist['y']**2)
return disparity, mtx1_scaled, mtx2_scaled, dist
def rigid_transfrom(pos, a_recon):
bc_pos_dict = Counter(pos['barcode'])
a_bc_matching_dict,_,_ = barcode_matching(bc_pos_dict, a_recon[anchor].values, max_dist=1)
a_truth, a_col = get_truth(list(a_bc_matching_dict.values()), pos)
a_umap_matched = a_recon[a_recon[anchor].isin(a_bc_matching_dict.keys())]
a_umap_matched = a_umap_matched.copy()
a_umap_matched.loc[:,'barcode'] = a_umap_matched[anchor].map(a_bc_matching_dict)
a_umap_matched = a_umap_matched[['xcoord','ycoord','barcode']]
a_umap_matched = a_umap_matched.groupby('barcode').mean().reset_index()
a_umap_matched = a_umap_matched.sort_values(by=['barcode'])
mtx1, mtx2, disparity = procrustes(a_truth[['xcoord','ycoord']], a_umap_matched[['xcoord','ycoord']])
a_matched_translate = a_umap_matched.copy()
a_matched_translate['xcoord'] = a_matched_translate['xcoord'] - np.mean(a_matched_translate['xcoord'])
a_matched_translate['ycoord'] = a_matched_translate['ycoord'] - np.mean(a_matched_translate['ycoord'])
R, s = orthogonal_procrustes(a_matched_translate[['xcoord','ycoord']], mtx2)
a_transformed = s * np.dot(a_recon[['xcoord','ycoord']], R)
a_scale = a_recon.copy()
a_scale['xcoord'] = a_transformed[:,0]/(np.max(a_transformed[:,0])-np.min(a_transformed[:,0]))*(np.max(pos.xcoord)-np.min(pos.xcoord))
a_scale['ycoord'] = a_transformed[:,1]/(np.max(a_transformed[:,1])-np.min(a_transformed[:,1]))*(np.max(pos.ycoord)-np.min(pos.ycoord))
a_scale['xcoord'] = a_scale['xcoord'] - np.mean(a_scale['xcoord'])
a_scale['ycoord'] = a_scale['ycoord'] - np.mean(a_scale['ycoord'])
return a_scale
In [2]:
import warnings
warnings.filterwarnings('ignore')
Load data¶
In [3]:
# define sample name and bead type, sample folder path
sample = 'c58_8'
anchor = 'V10' #anchor beads as the bead type what we want coordinates, the polyT beads in Slide-seq and polyA beads in Slide-tags
target = 'V9A30' #target beads as the bead type whose coordinates won't be used in spatial transcriptomics
sample_folder = os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Data_Figures/Slide_recon_seq',sample)
In [4]:
# load barcode-barcode conjugation information
blind_raw = pd.read_csv(os.path.join(sample_folder, sample+'_blind_raw_reads_filtered.csv.gz'))
blind_sum = blind_raw.groupby(['R1_bc', 'R2_bc']).size().reset_index(name='cnt')
blind_sum.columns = [anchor,target,'cnt']
# statistics of umi per bead
a_all = blind_sum.groupby(anchor)['cnt'].sum().reset_index(name='total_cnt')
t_all = blind_sum.groupby(target)['cnt'].sum().reset_index(name='total_cnt')
# statistics of covered bead per bead
a_cover_bc = blind_sum.groupby(anchor).count()
t_cover_bc = blind_sum.groupby(target).count()
In [13]:
%reload_ext memory_profiler
%matplotlib inline
Diffusion in ground truth¶
In [6]:
# load beads ground truth location from in situ sequencing, center to (0,0), and scale from pixel to um
pos = pd.read_csv(os.path.join(sample_folder,sample+'_matched_bead_location.csv.gz'))
pos['xcoord'] = pos['xcoord']-np.mean(pos['xcoord'])
pos['ycoord'] = pos['ycoord']-np.mean(pos['ycoord'])
pos['xcoord'] = pos['xcoord']*0.72
pos['ycoord'] = pos['ycoord']*0.72
In [9]:
bead_all = a_all
bead_type = anchor
base_type = target
match_pos = pd.merge(blind_sum, pos, left_on=base_type, right_on='barcode')
bead_all = bead_all.sort_values(by='total_cnt', ascending=False)
Fig1_d_left¶
In [77]:
b_n = 15000
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
b_match = b_match.sort_values(by='cnt', ascending=True)
fig, ax = plt.subplots(figsize=(2.5, 2))
scatter = sns.scatterplot(x=b_match['xcoord'], y=b_match['ycoord'], hue=np.log10(b_match['cnt']+1)
,palette='Blues', s=2,legend=None,edgecolor=None)
ax.set_xlim(-1500,1500)
ax.set_ylim(-1500,1500)
ax.set_xlabel('x (um)', fontsize=7)
ax.set_ylabel('y (um)', fontsize=7)
ax.tick_params(axis='both', which='major', labelsize=5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
norm = plt.Normalize(vmin=np.log10(b_match['cnt']+1).min(), vmax=np.log10(b_match['cnt']+1).max())
sm = plt.cm.ScalarMappable(cmap="Blues", norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax,pad=0.05)
cbar.set_label('log10')
cbar.ax.tick_params(labelsize=5, which='major')
cbar.set_label('log10(UMI)', fontsize=7)
ax.set_aspect('equal')
ax2 = plt.axes([0.14, 0.16, .36, .36])
sns.scatterplot(x=b_match['xcoord'], y=b_match['ycoord'], hue=np.log10(b_match['cnt']+1)
,palette='Blues', s=5,legend=None,edgecolor=None, ax=ax2)
ax2.set_xlim(600,900)
ax2.set_ylim(300,600)
ax2.set_aspect('equal')
ax2.set_xlabel('')
ax2.set_ylabel('')
ax2.set_xticks([])
ax2.set_yticks([])
# plt.savefig(os.path.join(out_dir,'Fig2b_a_diffusion_groundtruth.svg'),dpi=300)
plt.show()
Fig1_d_right¶
In [62]:
x_values = np.linspace(-1500, 1500, 1000)
densities_all = []
bead_all = bead_all.sort_values(by='total_cnt', ascending=False)
bi = range(0, 3000)
for b_n in bi:
if b_n % 200 == 0:
print(b_n)
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
if len(b_match) > 4:
b_x = np.repeat(b_match['xcoord'],b_match['cnt'])
b_x -= np.mean(b_x)
kde = gaussian_kde(b_x)
densities = kde.evaluate(x_values)
densities_all.append(np.array(densities))
densities_average = np.mean(np.array(densities_all), axis=0)
0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400 2600 2800
In [28]:
x = x_values
y = densities_average
peak_value = np.max(y)
# Step 2: Find half maximum
half_max = peak_value / 2
# Step 3: Locate points on the distribution
# np.abs(y - half_max) finds the absolute difference between y values and half max
# np.argmin finds the index of the smallest value in this difference array,
# which will be closest to the half maximum
left_idx = np.argmin(np.abs(y[:len(y)//2] - half_max))
right_idx = np.argmin(np.abs(y[len(y)//2:] - half_max)) + len(y)//2
# Step 4: Calculate FWHM
fwhm = x[right_idx] - x[left_idx]
print("FWHM:", fwhm)
FWHM: 123.12312312312315
In [55]:
## Fig2c
plt.figure(figsize=(3.8, 2))
# sns.scatterplot(x=x_values, y=densities_average, label='Ensemble KDE')
sns.lineplot(x=x_values, y=densities_average, label='Ensemble KDE')
sns.scatterplot(x=[x[left_idx],x[right_idx]], y=[y[left_idx],y[right_idx]], s=20,
label='FWHM',c = sns.color_palette("tab10")[1],edgecolor=None)
# plt.xlim(-100,100)
plt.xlabel('X/um', fontsize=7)
plt.ylabel('Frequency of UMI count', fontsize=7)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
# plt.grid(True, linestyle='--', alpha=0.7)
plt.tick_params(axis='both', which='both',labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig(os.path.join(out_dir,'Fig2c_a_x_average_kde.svg'),dpi=300)
plt.show()
/home/thechenlab/miniconda3/envs/slidelock/lib/python3.11/site-packages/seaborn/relational.py:433: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points. points = ax.scatter(x=x, y=y, **kws)
For reviewers: stable diffusion plot¶
In [17]:
# FigS2af
rows = 3
cols = 3
fig, axs = plt.subplots(rows, cols, figsize=(10, 7)) # Adjust figsize as needed
# Loop over all the subplots using a nested loop
for i in range(rows):
for j in range(cols):
ax = axs[i, j]
b_n = random.randint(1000,19000)
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
if len(b_match)==0:
b_n = random.randint(1000,19000)
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
b_match = b_match.sort_values(by='cnt', ascending=True)
scatter = sns.scatterplot(x=b_match['xcoord'], y=b_match['ycoord'], hue=np.log10(b_match['cnt']+1)
,palette='Blues', s=5,legend=None,edgecolor=None,ax=ax)
ax.set_xlim(-1600,1600)
ax.set_ylim(-1600,1600)
ax.set_xlabel('X ($\mu$m)', fontsize=8)
ax.set_ylabel('y ($\mu$m)', fontsize=8)
ax.set_title(f'cnt: {b_match.cnt.sum()}', fontsize=10)
ax.tick_params(axis='both', which='major', labelsize=8)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
norm = plt.Normalize(vmin=np.log10(b_match['cnt']+1).min(), vmax=np.log10(b_match['cnt']+1).max())
sm = plt.cm.ScalarMappable(cmap="Blues", norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax,pad=0.05,shrink=0.6)
cbar.set_label('log10')
cbar.ax.tick_params(labelsize=8, which='major')
cbar.set_label('log10(UMI)', fontsize=8)
ax.set_aspect('equal')
# Adjust layout to prevent overlap
plt.tight_layout()
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2af_diffusion_examples.png'),dpi=300)
plt.show()
In [346]:
## FigS2ag
plt.figure(figsize=(4,3))
sns.histplot(np.log10(bead_all['total_cnt']), bins=50)
plt.xlabel('log10(total count)', fontsize=10)
plt.ylabel('number of beads', fontsize=10)
plt.title('{} beads\n median cnt={}'.format(len(bead_all), bead_all['total_cnt'].median()), fontsize=12, pad=20)
plt.tick_params(axis='both', which='major', labelsize=8)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2ag_count_per_bead.png'),dpi=300)
plt.show()
In [ ]:
from scipy.stats import gaussian_kde
fwhm_list = []
for b_n in range(len(bead_all)):
if b_n%100==0:
print(b_n)
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
if len(b_match)==0:
continue
b_match = b_match.sort_values(by='cnt', ascending=True)
x = b_match['xcoord']
# KDE fit
kde = gaussian_kde(x)
x_grid = np.linspace(x.min(), x.max(), 1000)
kde_values = kde(x_grid)
x = x_grid
y = kde_values
peak_value = np.max(y)
half_max = peak_value / 2
left_idx = np.argmin(np.abs(y[:len(y)//2] - half_max))
right_idx = np.argmin(np.abs(y[len(y)//2:] - half_max)) + len(y)//2
fwhm = x[right_idx] - x[left_idx]
fwhm_list.append(fwhm)
0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000
In [43]:
## FigS2aj
plt.figure(figsize=(4,3))
sns.histplot(fwhm_list, bins=50)
plt.xlabel('FWHM (µm)', fontsize=10)
plt.ylabel('Number of Beads', fontsize=10)
plt.title('FWHM Median = {:.1f} µm'.format(np.median(fwhm_list)), fontsize=12, pad=20)
plt.tick_params(axis='both', which='major', labelsize=8)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2aj_fwhm_distribution.png'),dpi=300)
plt.show()
In [102]:
fwhm_list = []
x_values = np.linspace(-1500, 1500, 1000)
bi = range(0, 3000)
for b_n in bi:
if b_n%100==0:
print(b_n)
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
if len(b_match)<10:
continue
b_x = np.repeat(b_match['xcoord'],b_match['cnt'])
b_x -= np.median(b_x)
kde = gaussian_kde(b_x)
densities = kde.evaluate(x_values)
y = densities
peak_value = np.max(y)
half_max = peak_value / 2
left_idx = np.argmin(np.abs(y[:len(y)//2] - half_max))
right_idx = np.argmin(np.abs(y[len(y)//2:] - half_max)) + len(y)//2
fwhm = x_values[right_idx] - x_values[left_idx]
fwhm_list.append(fwhm)
0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900
In [96]:
b_n = 4
b = bead_all.iloc[b_n, 0]
b_match = match_pos.loc[match_pos[bead_type] == b,]
b_x = np.repeat(b_match['xcoord'],b_match['cnt'])
b_x -= np.median(b_x)
kde = gaussian_kde(b_x)
densities = kde.evaluate(x_values)
y = densities
peak_value = np.max(y)
half_max = peak_value / 2
left_idx = np.argmin(np.abs(y[:len(y)//2] - half_max))
right_idx = np.argmin(np.abs(y[len(y)//2:] - half_max)) + len(y)//2
fwhm = x_values[right_idx] - x_values[left_idx]
print(fwhm)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# Scatter plot
ax1.scatter(b_match.xcoord,b_match.cnt, alpha=0.5)
ax1.set_xlabel('xcoord')
ax1.set_ylabel('Normalized cnt')
ax1.set_title('Scatter Plot')
# KDE plot
ax2.plot(x_values, densities, linewidth=2)
ax2.scatter(x=[x_values[left_idx],x_values[right_idx]], y=[y[left_idx],y[right_idx]],c='red')
ax2.set_xlabel('cnt')
ax2.set_ylabel('Density')
ax2.set_title('KDE on cnt')
# Show plots
plt.tight_layout()
plt.show()
117.117117117117
In [118]:
## FigS2aj
plt.figure(figsize=(4,3))
sns.histplot(fwhm_list, bins=100)
plt.xlabel('FWHM (µm)', fontsize=10)
plt.ylabel('Number of Beads', fontsize=10)
plt.title('FWHM per diffusion distribution \nMedian = {:.1f} µm'.format(np.median(fwhm_list)), fontsize=12, pad=20)
plt.tick_params(axis='both', which='major', labelsize=8)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.xlim(0,600)
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2aj_fwhm_distribution.png'),dpi=300)
plt.show()
Generate diffusion matrix¶
In [316]:
%%memit
time0 = time.time()
# define filtering of beads based on minmum or maximum count per bead
a_min = 0
a_max = 100000
t_min = 0
t_max = 100000
# construct anchor by target diffusion matrix
counts, a_sel, t_sel = get_matrix(blind_sum, min_a_cnt=a_min, max_a_cnt=a_max, min_t_cnt=t_min, max_t_cnt=t_max, anchor=anchor, target=target)
time2 = time.time()
print('time for generating matrix: ', time2-time0)
a: 19408 t: 19681 time for generating matrix: 9.368290424346924 peak memory: 45391.08 MiB, increment: 1.38 MiB
In [8]:
# Save the CSR matrix to a file
save_npz(os.path.join(sample_folder, sample+'_counts.npz'), counts)
In [18]:
from scipy.sparse import save_npz, load_npz
# Load the CSR matrix from the file
counts = load_npz(os.path.join(sample_folder, sample+'_counts.npz'))
In [19]:
counts
Out[19]:
<19408x19681 sparse matrix of type '<class 'numpy.int64'>' with 9070895 stored elements in Compressed Sparse Row format>
Reconstruction with UMAP¶
In [14]:
%%memit
n_epo = 50000
time5 = time.time()
reducer = umap.UMAP(metric='cosine',
n_neighbors=25,
min_dist=0.99,
low_memory=False,
n_components=2,
# random_state=0,
verbose=True,
n_epochs=n_epo,
# output_dens = True,
# local_connectivity = 30,
learning_rate = 1)
embedding = reducer.fit_transform(np.log1p(counts))
time_loop = time.time()
print('umap: ', time_loop-time5)
UMAP(angular_rp_forest=True, learning_rate=1, low_memory=False, metric='cosine', min_dist=0.99, n_epochs=50000, n_neighbors=25, verbose=True) Sat Mar 30 12:06:17 2024 Construct fuzzy simplicial set Sat Mar 30 12:06:17 2024 Finding Nearest Neighbors Sat Mar 30 12:06:17 2024 Building RP forest with 12 trees Sat Mar 30 12:06:20 2024 metric NN descent for 14 iterations 1 / 14 2 / 14 3 / 14 Stopping threshold met -- exiting after 3 iterations Sat Mar 30 12:07:37 2024 Finished Nearest Neighbor Search Sat Mar 30 12:07:39 2024 Construct embedding
Epochs completed: 0%| 0/50000 [00:00]
completed 0 / 50000 epochs completed 5000 / 50000 epochs completed 10000 / 50000 epochs completed 15000 / 50000 epochs completed 20000 / 50000 epochs completed 25000 / 50000 epochs completed 30000 / 50000 epochs completed 35000 / 50000 epochs completed 40000 / 50000 epochs completed 45000 / 50000 epochs Sat Mar 30 12:32:50 2024 Finished embedding umap: 1593.8311195373535 peak memory: 6655.95 MiB, increment: 304.25 MiB
In [29]:
a_recon = pd.DataFrame(embedding)
a_recon.columns = ['xcoord','ycoord']
a_recon.insert(loc=0, column=anchor, value=a_sel[anchor])
In [14]:
a_recon = pd.read_csv(os.path.join(sample_folder, sample+'_'+anchor+'_recon_loc.csv'))
In [15]:
# a rigid transformation from recon result to ground truth by scaling, translation, rotation and reflection
a_recon = rigid_transfrom(pos, a_recon)
Evaluation with absolute error¶
In [30]:
# matching between recon barcodes and in situ sequencing barcode list with 1 mismatch, find common barcodes
bc_pos_dict = Counter(pos['barcode'])
a_bc_matching_dict,_,_ = barcode_matching(bc_pos_dict, a_sel[anchor].values, max_dist=1)
# get ground truth locations of matched common barcodes,
a_truth, a_col = get_truth(list(a_bc_matching_dict.values()), pos)
In [ ]:
# need to reload groundtruth
pos = pd.read_csv(os.path.join(sample_folder, sample+'_matched_bead_location.csv.gz'))
bc_pos_dict = Counter(pos['barcode'])
a_bc_matching_dict,_,_ = barcode_matching(bc_pos_dict, a_sel[anchor].values, max_dist=1)
a_truth, a_col = get_truth(list(a_bc_matching_dict.values()), pos)
In [62]:
recon_umap = pd.DataFrame(embedding)
a_umap = recon_umap.copy()
a_umap[anchor] = a_sel[anchor].values
a_umap.columns = ['xcoord', 'ycoord', anchor]
a_umap_matched = a_umap[a_umap[anchor].isin(a_bc_matching_dict.keys())]
a_umap_matched = a_umap_matched.copy()
a_umap_matched.loc[:,'barcode'] = a_umap_matched[anchor].map(a_bc_matching_dict)
a_umap_matched = a_umap_matched[['xcoord','ycoord','barcode']]
a_umap_matched = a_umap_matched.groupby('barcode').mean().reset_index()
a_umap_matched = a_umap_matched.sort_values(by=['barcode'])
Fig1_e¶
In [63]:
# need to scale by 0.72 from pixel to µm
plt.figure(figsize=(2, 2))
sns.scatterplot(x=a_truth['xcoord'], y=a_truth['ycoord'], c=a_col, s=0.8, edgecolor=None)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.xlabel('')
plt.ylabel('')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
x_cmap = make_colormap([(0, 0, 1), (1, 0, 1)]) # Blue to Magenta for x
y_cmap = make_colormap([(0, 0, 1), (0, 1, 1)]) # Blue to Cyan for y
fig = plt.gcf()
sm_x = ScalarMappable(cmap=x_cmap, norm=Normalize(0,1))
sm_x.set_array([])
cb_ax_x = fig.add_axes([0.13, 0.1, 0.2, 0.02]) # Adjust these values to position your colorbar
cbar_x = plt.colorbar(sm_x, cax=cb_ax_x, orientation='horizontal', ticks=[])
cbar_x.set_label('x')
sm_y = ScalarMappable(cmap=y_cmap, norm=Normalize(0,1))
sm_y.set_array([])
cb_ax_y = fig.add_axes([0.13, 0.1, 0.02, 0.2]) # Adjust these values to position your colorbar
cbar_y = plt.colorbar(sm_y, cax=cb_ax_y, orientation='vertical', ticks=[])
cbar_y.set_label('y')
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2','Fig2q_c58_8_truth.svg'),dpi=300)
# plt.close()
plt.show()
Fig1_f¶
In [64]:
plt.figure(figsize=(2, 2))
sns.scatterplot(x=a_umap_matched['xcoord'], y=a_umap_matched['ycoord'], c=a_col, s=0.8, edgecolor=None)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.xlabel('')
plt.ylabel('')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2','Fig2r_c58_8_recon.svg'),dpi=300)
# plt.close()
plt.show()
FigS3_a¶
In [70]:
close_index = [index for index, element in enumerate(a_comp['r'].values) if element < 300]
fig = plt.figure(figsize=(2, 2))
ax = plt.axes()
im = ax.scatter(x=a_truth['xcoord'].values[close_index], y=a_truth['ycoord'].values[close_index], c=a_comp['r'].values[close_index], alpha=1, s=2.5, edgecolor='none')
# plt.title('Simulated Anchor Location ({})'.format(n_a), fontsize=16, pad=20)
# plt.annotate('simulated anchor location ({})'.format(n_a), xy=(0.5, -0), xycoords='axes fraction', ha='center', va='center', fontsize=16)
plt.xlabel('X', fontsize=7)
plt.ylabel('Y', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
cax = fig.add_axes([ax.get_position().x1+0.01,ax.get_position().y0,0.02,ax.get_position().height])
cbar = plt.colorbar(im, cax=cax, pad=0.05)
cbar.ax.tick_params(labelsize=5, which='major')
cbar.set_label('error', fontsize=7)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS3','FigS3e_c58_8_a_error_spatial.svg'),dpi=300)
# plt.close()
plt.show()
FigS3_b¶
In [60]:
da, a_truth_pa, a_recon_pa, a_comp = get_pa(a_truth,a_umap_matched)
In [61]:
idx_a = [random.randint(1, len(a_truth_pa)-1) for _ in range(3000)]
fig, ax = plt.subplots(figsize=(2, 2))
# Plotting the starting and ending points
for (x1, y1), (x2, y2) in zip(a_truth_pa[idx_a,:], a_recon_pa[idx_a,:]):
# ax.scatter(x1, y1, color='blue', marker='o', s=15, label='truth') # starting point
# ax.scatter(x2, y2, color='red', marker='x', s=15, label='recon') # ending point
if abs(x2-x1)<200 and abs(y2-y1)<200:
ax.arrow(x1, y1, x2 - x1, y2 - y1, width=10, head_width=30, head_length=30, edgecolor='none', shape = 'full')
# Setting labels and title
plt.xlabel('X', fontsize=7)
plt.ylabel('Y', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS3','FigS3c_c58_8_displacement_vectors.svg'),dpi=300)
plt.show()
FigS3_c¶
In [65]:
plt.figure(figsize=(3, 2))
sns.histplot(a_comp['r'], bins=np.arange(0,200,5), alpha=0.7, edgecolor='black')
plt.xlabel('error', fontsize=7)
plt.ylabel('number of beads', fontsize=7)
# plt.grid(True, linestyle='--', alpha=0.7)
plt.xlim(0,200)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2d_c58_8_error_distribution.svg'),dpi=300)
plt.show()
Error at measurement length¶
In [72]:
a_comp.reset_index(drop=True, inplace=True)
indices = a_comp[a_comp['r'] < 200].index.tolist()
a_truth_pa_pairwise = cdist(a_truth_pa[indices,:], a_truth_pa[indices,:])
a_recon_pa_pairwise = cdist(a_recon_pa[indices,:], a_recon_pa[indices,:])
diff = a_recon_pa_pairwise - a_truth_pa_pairwise
intervals = [(i*30+0.001, (i+1)*30) for i in range(100)]
rms_values = []
rms_sd = []
a_truth_pairwise = a_truth_pa_pairwise.flatten()
diff = diff.flatten()
for interval in intervals:
mask = (a_truth_pairwise >= interval[0]) & (a_truth_pairwise < interval[1])
corresponding_values = diff[mask]
rms = np.sqrt(np.mean(corresponding_values**2))
sd = np.std(np.sqrt(corresponding_values**2))
rms_values.append(rms)
rms_sd.append(sd)
Fig1_i¶
In [79]:
plt.figure(figsize=(2, 1.5))
plt.plot([i*30 for i in range(100)],rms_values_c58_8, label='data')
plt.fill_between([i*30 for i in range(100)], [rms_values_c58_8[i] - rms_sd_c58_8[i] for i in range(100)],
[rms_values_c58_8[i] + rms_sd_c58_8[i] for i in range(100)], alpha=0.1)
plt.plot([i*30 for i in range(100)],rms_values_c58_4, label='Replicate 1')
plt.fill_between([i*30 for i in range(100)], [rms_values_c58_4[i] - rms_sd_c58_4[i] for i in range(100)],
[rms_values_c58_4[i] + rms_sd_c58_4[i] for i in range(100)], alpha=0.1)
plt.plot([i*30 for i in range(100)],rms_values_c58_5, label='Replicate 2')
plt.fill_between([i*30 for i in range(100)], [rms_values_c58_5[i] - rms_sd_c58_5[i] for i in range(100)],
[rms_values_c58_5[i] + rms_sd_c58_5[i] for i in range(100)], alpha=0.1)
plt.plot([i*30 for i in range(100)],[20]*100, linestyle='dashed', label='Slide-seq', linewidth=1)
plt.xlim(0,2500)
plt.ylim(0,60)
# plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.xlabel('Distance/um', fontsize=7)
plt.ylabel('RMS of error/um', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2'
# ,'Fig2o_RMS_error_compare.svg'),dpi=300)
# plt.close()
plt.show()
FigS3_d¶
In [78]:
plt.figure(figsize=(2, 1.5))
plt.plot([i*30 for i in range(1,100)],[rms_values_c58_8[i]/(i*30) for i in range(1,100)], label='data in (h)')
# plt.plot([i*30 for i in range(100)],[20]*100, linestyle='dashed', label='Slide-seq')
plt.fill_between([i*30 for i in range(1,100)], [(rms_values_c58_8[i] - rms_sd_c58_8[i])/(i*30) for i in range(1,100)],
[(rms_values_c58_8[i] + rms_sd_c58_8[i])/(i*30) for i in range(1,100)], alpha=0.1)
plt.plot([i*30 for i in range(1,100)],[rms_values_c58_4[i]/(i*30) for i in range(1,100)], label='Replicate 2')
# plt.plot([i*30 for i in range(100)],[20]*100, linestyle='dashed', label='Slide-seq')
plt.fill_between([i*30 for i in range(1,100)], [(rms_values_c58_4[i] - rms_sd_c58_4[i])/(i*30) for i in range(1,100)],
[(rms_values_c58_4[i] + rms_sd_c58_4[i])/(i*30) for i in range(1,100)], alpha=0.1)
plt.plot([i*30 for i in range(1,100)],[rms_values_c58_5[i]/(i*30) for i in range(1,100)], label='Replicate 3')
# plt.plot([i*30 for i in range(100)],[20]*100, linestyle='dashed', label='Slide-seq')
plt.fill_between([i*30 for i in range(1,100)], [(rms_values_c58_5[i] - rms_sd_c58_5[i])/(i*30) for i in range(1,100)],
[(rms_values_c58_5[i] + rms_sd_c58_5[i])/(i*30) for i in range(1,100)], alpha=0.1)
# plt.plot([i*30 for i in range(100)],[20]*100, linestyle='dashed', label='Slide-seq', linewidth=1)
plt.xlim(0,2500)
plt.ylim(0,0.13)
# plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.xlabel('Measurement Length/um', fontsize=7)
plt.ylabel('Relative RMS of error/um', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2'
# ,'FigS2q_relative_RMS_error_compare.svg'),dpi=300)
# # plt.close()
plt.show()
Reconstruction different from in situ seq¶
In [130]:
# load data of diffusion distribution in ground truth
match_raw = pd.read_csv(os.path.join(sample_folder,sample+'_raw_matched_reads.csv.gz'))
match_sum = match_raw.groupby(['P5_bc', 'V9_bc']).size().reset_index(name='cnt')
match_sum.columns = [anchor, target, 'cnt']
a_all = match_sum.groupby(anchor)['cnt'].sum().reset_index(name='total_cnt')
t_all = match_sum.groupby(target)['cnt'].sum().reset_index(name='total_cnt')
bead_all = a_all
bead_type = anchor
base_type = target
match_pos = pd.merge(match_sum, pos, left_on=base_type, right_on='barcode')
bead_all = bead_all.sort_values(by='total_cnt', ascending=False)
In [123]:
a_bc_matching_dict,_,_ = barcode_matching(Counter(pos['barcode']), a_recon[anchor].values, max_dist=1)
a_truth, a_col = get_truth(list(a_bc_matching_dict.values()), pos)
a_recon_matched = a_recon[a_recon[anchor].isin(a_bc_matching_dict.keys())].copy()
a_recon_matched.loc[:,'barcode'] = a_recon_matched[anchor].map(a_bc_matching_dict)
a_recon_matched = a_recon_matched[['xcoord','ycoord','barcode']]
a_recon_matched = a_recon_matched.groupby('barcode').mean().reset_index()
a_recon_matched = a_recon_matched.sort_values(by=['barcode'])
In [124]:
da, a_truth_pa, a_recon_pa, a_comp = get_pa(a_truth,a_recon_matched)
In [125]:
len(a_comp[a_comp.r>200])/len(a_comp)
Out[125]:
0.0012215232394796312
In [132]:
palette = sns.color_palette()
In [137]:
# FigS2x
rows = 5
cols = 4
fig, axs = plt.subplots(rows, cols, figsize=(12, 14)) # Adjust figsize as needed
# Loop over all the subplots using a nested loop
for i in range(rows):
for j in range(cols):
ax = axs[i, j]
b = a_comp[a_comp.r > 200].barcode.values[4*i+j]
b_match = match_pos.loc[match_pos[bead_type] == b]
b_match = b_match.sort_values(by='cnt', ascending=True)
scatter = sns.scatterplot(x=b_match['xcoord'], y=b_match['ycoord'], hue=np.log10(b_match['cnt']+1)
,palette='Blues', s=5,legend=None,edgecolor=None,ax=ax)
sns.scatterplot(x=[a_truth_pa[np.where(a_comp.barcode==b),0][0][0]], y=[a_truth_pa[np.where(a_comp.barcode==b),1][0][0]]
, c=palette[2],s=7,legend=None,edgecolor=None,ax=ax)
sns.scatterplot(x=[a_recon_pa[np.where(a_comp.barcode==b),0][0][0]], y=[a_recon_pa[np.where(a_comp.barcode==b),1][0][0]]
, c=palette[1],s=7,legend=None,edgecolor=None,ax=ax)
ax.set_xlim(-1600,1600)
ax.set_ylim(-1600,1600)
ax.set_xlabel('X ($\mu$m)', fontsize=8)
ax.set_ylabel('y ($\mu$m)', fontsize=8)
ax.tick_params(axis='both', which='major', labelsize=8)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
norm = plt.Normalize(vmin=np.log10(b_match['cnt']+1).min(), vmax=np.log10(b_match['cnt']+1).max())
sm = plt.cm.ScalarMappable(cmap="Blues", norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax,pad=0.05,shrink=0.5)
cbar.set_label('log10')
cbar.ax.tick_params(labelsize=8, which='major')
cbar.set_label('log10(UMI)', fontsize=8)
ax.set_aspect('equal')
# Adjust layout to prevent overlap
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2x_insitu_error_all.png'),dpi=300)
plt.show()
UMAP parameters¶
In [20]:
from cuml.manifold.umap import UMAP as cuUMAP
FigS6¶
In [64]:
n_nei_list = [5,10,25,40]
m_dist_list = [0.1,0.4,0.7,0.9]
embedding_a_list = []
for n_nei in n_nei_list:
for m_dist in m_dist_list:
time5 = time.time()
reducer = cuUMAP(metric='cosine',
n_neighbors=n_nei,
min_dist=m_dist,
# low_memory=False,
n_components=2,
# random_state=0,
# verbose=True,
n_epochs=50000,
# output_dens = True,
# local_connectivity = 30,
learning_rate = 1)
embedding_a = reducer.fit_transform(np.log1p(counts))
embedding_a_list.append(embedding_a)
time_loop = time.time()
print('umap: ', time_loop-time5)
umap: 2.1813619136810303 umap: 2.105126142501831 umap: 2.126061201095581 umap: 2.154585838317871 umap: 2.4830498695373535 umap: 2.512467861175537 umap: 2.5037102699279785 umap: 2.515470504760742 umap: 3.1340420246124268 umap: 3.1935245990753174 umap: 3.1323208808898926 umap: 3.1374306678771973 umap: 3.5958411693573 umap: 3.6072001457214355 umap: 3.581209897994995 umap: 3.632389545440674
In [65]:
embedding_a_matched_list = []
for embedding in embedding_a_list:
recon_umap = pd.DataFrame(embedding)
a_umap = recon_umap.copy()
# a_umap[anchor] = adata.obs.index.values
a_umap[anchor] = a_sel[anchor].values
# a_umap[target] = t_sel[target].values
a_umap.columns = ['xcoord', 'ycoord', anchor]
a_umap_matched = a_umap[a_umap[anchor].isin(a_bc_matching_dict.keys())]
a_umap_matched = a_umap_matched.copy()
a_umap_matched.loc[:,'barcode'] = a_umap_matched[anchor].map(a_bc_matching_dict)
a_umap_matched = a_umap_matched[['xcoord','ycoord','barcode']]
a_umap_matched = a_umap_matched.groupby('barcode').mean().reset_index()
a_umap_matched = a_umap_matched.sort_values(by=['barcode'])
embedding_a_matched_list.append(a_umap_matched[['xcoord','ycoord']].values)
In [86]:
# FigS2y
fig, axes = plt.subplots(len(n_nei_list), len(m_dist_list), figsize=(8, 9))
for i, ax_row in enumerate(axes):
for j, ax in enumerate(ax_row):
sns.scatterplot(x=embedding_a_matched_list[i*len(m_dist_list)+j][:,0], y=embedding_a_matched_list[i*len(m_dist_list)+j][:,1]
, c=a_col, alpha=0.8, s=1.5, edgecolor=None, ax=ax)
ax.set_xticks([])
ax.set_yticks([])
for spine in ax.spines.values():
spine.set_visible(False)
ax.set_xlabel('')
ax.set_ylabel('')
ax.axis('equal')
ax.set_title(f"n_neighbors: {n_nei_list[i]}\nmin_dist: {m_dist_list[j]}", fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2y_parameters_nnei_mdist.png'),dpi=300)
plt.show()
In [69]:
a_comp_list = []
for i in range(len(n_nei_list)*len(m_dist_list)):
embedding_a = embedding_a_matched_list[i]
da, a_truth_pa, a_recon_pa, a_comp = get_pa(a_truth, pd.DataFrame(embedding_a,columns=['xcoord','ycoord']))
a_comp_list.append(a_comp.r.values)
In [70]:
a_comp_flat = [item for sublist in a_comp_list for item in sublist]
parameters = [f"n_neig={n_nei_list[i]}, min_dist={m_dist_list[j]}" for i in range(len(n_nei_list)) for j in range(len(m_dist_list))]
repeated_parameters = [param for param in parameters for _ in range(len(a_comp_list[0]))]
a_error = {
'error': a_comp_flat,
'n_neighbor': [i.split(', ')[0] for i in repeated_parameters],
'min_dist': [i.split(', ')[1] for i in repeated_parameters]
}
a_error_df = pd.DataFrame(a_error)
In [99]:
## FigS2z
fig, ax1 = plt.subplots(figsize=(7.5, 3))
sns.boxplot(x="n_neighbor", y="error",hue='min_dist',data=a_error_df,saturation=0.8)
ax1.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=10)
ax1.set_ylabel(r"Error ($\mu$m)", fontsize=10)
ax1.set_xlabel('Parameters', fontsize=10)
ax1.tick_params(axis='both', which='major', labelsize=8)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.set_yscale('log')
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2z_parameters_nnei_mdist_box.png'),dpi=300)
plt.show()
FigS5¶
In [50]:
n_epo_list = [500,1000,5000,10000,100000]
metric_list = ['euclidean','cosine']
embedding_a_list_2 = []
for n_epo in n_epo_list:
for m in metric_list:
time5 = time.time()
reducer = cuUMAP(metric=m,
n_neighbors=25,
min_dist=0.99,
# low_memory=False,
n_components=2,
# random_state=0,
# verbose=True,
n_epochs=n_epo,
# output_dens = True,
# local_connectivity = 30,
learning_rate = 1)
embedding_a = reducer.fit_transform(counts)
embedding_a_list_2.append(embedding_a)
time_loop = time.time()
print('umap: ', time_loop-time5)
time5 = time.time()
reducer = cuUMAP(metric=m,
n_neighbors=25,
min_dist=0.99,
# low_memory=False,
n_components=2,
# random_state=0,
# verbose=True,
n_epochs=n_epo,
# output_dens = True,
# local_connectivity = 30,
learning_rate = 1)
embedding_a = reducer.fit_transform(np.log1p(counts))
embedding_a_list_2.append(embedding_a)
time_loop = time.time()
print('umap: ', time_loop-time5)
umap: 2.96703839302063 umap: 2.9434497356414795 umap: 1.5086262226104736 umap: 1.5352637767791748 umap: 2.942100763320923 umap: 2.962200403213501 umap: 1.5224533081054688 umap: 1.531818151473999 umap: 3.1992783546447754 umap: 3.1031429767608643 umap: 1.6672801971435547 umap: 1.676203727722168 umap: 3.5135278701782227 umap: 3.3104631900787354 umap: 1.7958929538726807 umap: 1.8281021118164062 umap: 9.239009857177734 umap: 7.011570453643799 umap: 4.7081403732299805 umap: 4.705032110214233
In [51]:
embedding_a_matched_list_2 = []
for embedding in embedding_a_list_2:
recon_umap = pd.DataFrame(embedding)
a_umap = recon_umap.copy()
# a_umap[anchor] = adata.obs.index.values
a_umap[anchor] = a_sel[anchor].values
# a_umap[target] = t_sel[target].values
a_umap.columns = ['xcoord', 'ycoord', anchor]
a_umap_matched = a_umap[a_umap[anchor].isin(a_bc_matching_dict.keys())]
a_umap_matched = a_umap_matched.copy()
a_umap_matched.loc[:,'barcode'] = a_umap_matched[anchor].map(a_bc_matching_dict)
a_umap_matched = a_umap_matched[['xcoord','ycoord','barcode']]
a_umap_matched = a_umap_matched.groupby('barcode').mean().reset_index()
a_umap_matched = a_umap_matched.sort_values(by=['barcode'])
embedding_a_matched_list_2.append(a_umap_matched[['xcoord','ycoord']].values)
In [52]:
# FigS2aa
fig, axes = plt.subplots(5, 4, figsize=(10, 14))
for i, ax_row in enumerate(axes):
for j, ax in enumerate(ax_row):
sns.scatterplot(x=embedding_a_matched_list_2[i*4+j][:,0], y=embedding_a_matched_list_2[i*4+j][:,1]
, c=a_col, alpha=1, s=2,edgecolor=None, ax=ax)
ax.set_xticks([])
ax.set_yticks([])
for spine in ax.spines.values():
spine.set_visible(False)
ax.set_xlabel('')
ax.set_ylabel('')
ax.axis('equal')
if j==0:
ax.set_title(f"n_epochs: {n_epo_list[i]},euclidean", fontsize=8)
elif j==1:
ax.set_title(f"n_epochs: {n_epo_list[i]},euclidean,log1p", fontsize=8)
elif j==2:
ax.set_title(f"n_epochs: {n_epo_list[i]},cosine", fontsize=8)
elif j==3:
ax.set_title(f"n_epochs: {n_epo_list[i]},cosine,log1p", fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2aa_parameters_epochs.png'),dpi=300)
plt.show()
In [53]:
a_comp_list_2 = []
for i in range(20):
embedding_a = embedding_a_matched_list_2[i]
da, a_truth_pa, a_recon_pa, a_comp = get_pa(a_truth, pd.DataFrame(embedding_a,columns=['xcoord','ycoord']))
a_comp_list_2.append(a_comp.r.values)
In [54]:
a_comp_flat = [item for sublist in a_comp_list_2 for item in sublist]
metric_list = [['euclidean']*len(a_comp_list_2[0]),['euclidean,log1p']*len(a_comp_list_2[0])
,['cosine']*len(a_comp_list_2[0]),['cosine,log1p']*len(a_comp_list_2[0])]*5
metric_flat = [item for sublist in metric_list for item in sublist]
a_error_2 = {
'error': a_comp_flat,
'n_epochs': [n for n in n_epo_list for _ in range(4*len(a_comp_list_2[0]))],
'metric': metric_flat
}
a_error_df_2 = pd.DataFrame(a_error_2)
In [57]:
## FigS2ab
fig, ax1 = plt.subplots(figsize=(10, 4))
sns.boxplot(x="n_epochs", y="error",hue='metric',data=a_error_df_2,saturation=0.8)
ax1.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
ax1.set_ylabel('Error', fontsize=7)
ax1.set_xlabel('n_epochs', fontsize=7)
ax1.tick_params(axis='both', which='major', labelsize=6)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.set_yscale('log')
plt.tight_layout()
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2ab_parameters_epoch_box.png'),dpi=300)
plt.show()
Read downsampling for recon library¶
For reviewers: UMI vs read downsampling¶
In [153]:
sample_files = os.listdir('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/')
downsample_folders = [item for item in sample_files if item.startswith('downsampling')]
downsample_folders.sort()
In [154]:
downsample_folders
Out[154]:
['downsampling001', 'downsampling003', 'downsampling005', 'downsampling01', 'downsampling02', 'downsampling03', 'downsampling04', 'downsampling05', 'downsampling06', 'downsampling07', 'downsampling08']
In [167]:
reads_list = []
UMI_list = []
for downsample in downsample_folders:
stats = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/',
downsample, sample+'_blind_statistics_filtered.csv'))
reads_list.append(stats.counts[0])
blind_raw = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/',
downsample, sample+'_blind_raw_reads_filtered.csv.gz'))
blind_sum = blind_raw.groupby(['R1_bc', 'R2_bc']).size().reset_index(name='cnt')
UMI_list.append(blind_sum.cnt.sum())
In [168]:
stats = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/', sample+'_blind_statistics_filtered.csv'))
reads_list.append(stats.counts[0])
blind_raw = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/', sample+'_blind_raw_reads_filtered.csv.gz'))
blind_sum = blind_raw.groupby(['R1_bc', 'R2_bc']).size().reset_index(name='cnt')
UMI_list.append(blind_sum.cnt.sum())
In [206]:
1-UMI_list[-1]/reads_list[-1]
Out[206]:
0.6148929129257803
In [214]:
# FigS2ad
# Scatter plot of the data
plt.figure(figsize=(5, 3))
plt.scatter(reads_list, UMI_list, alpha=0.5, label='downsampling')
# Fit a polynomial of degree 2
degree = 2
coeffs = np.polyfit(reads_list, UMI_list, degree)
poly_eqn = np.poly1d(coeffs)
x = np.linspace(0,2e8)
fitted_umi = poly_eqn(x)
# Plot the polynomial fit
plt.plot(x, fitted_umi, color='red',label='fit')
plt.title('UMI vs Reads Downsampling', fontsize=12)
plt.xlabel('Reads', fontsize=10)
plt.ylabel('UMI', fontsize=10)
plt.tick_params(axis='both', which='both',labelsize=8)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=10)
plt.text(0.9e8, 5.5e7, 'saturation: 0.61')
plt.grid(True)
plt.tight_layout()
plt.savefig('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2ad_umi_reads_downsampling.svg',dpi=300)
plt.show()
In [218]:
UMI_read_df = pd.DataFrame({'read':reads_list,'UMI':UMI_list})
UMI_read_df.to_csv(os.path.join(sample_folder, 'UMI_read_downsampling.csv'), index=False)
For reviewers: error vs reads downsampling¶
In [244]:
a_error = []
for downsample in downsample_folders:
print(downsample)
a_recon = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/',
downsample,'recon/recon_GPU_0_0.99_500000',anchor+'_recon_loc.csv'))
bc_pos_dict = Counter(pos['barcode'])
a_bc_matching_dict,_,_ = barcode_matching(bc_pos_dict, a_recon[anchor].values, max_dist=1)
a_truth, a_col = get_truth(list(a_bc_matching_dict.values()), pos)
a_recon_matched = a_recon[a_recon[anchor].isin(a_bc_matching_dict.keys())].copy()
a_recon_matched['barcode'] = a_recon_matched[anchor].map(a_bc_matching_dict)
a_recon_matched = a_recon_matched[['xcoord','ycoord','barcode']]
a_recon_matched = a_recon_matched.groupby('barcode').mean().reset_index()
a_recon_matched = a_recon_matched.sort_values(by=['barcode'])
da, a_truth_pa, a_recon_pa, a_comp = get_pa(a_truth,a_recon_matched)
print(da)
a_comp['r'].to_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/',
downsample,'recon/recon_GPU_0_0.99_500000',anchor+'_error.csv'))
a_error.append(a_comp.r.values)
downsampling001 0.02547990335504945 downsampling003 0.005438390834688951 downsampling005 0.004133625068729526 downsampling01 0.004022352157472475 downsampling02 0.004126595294415976 downsampling03 0.004197480972069649 downsampling04 0.004191522502261081 downsampling05 0.00417070725918965 downsampling06 0.004244413996323651 downsampling07 0.004275181341595529 downsampling08 0.004272362545540993
In [245]:
a_error_1 = pd.read_csv('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8/recon/V10_error.csv')
In [247]:
a_error.append(a_error_1.r.values)
In [268]:
a_error_df.to_csv(os.path.join(sample_folder, 'error_read_downsampling.csv'), index=False)
In [266]:
# FigS2ae
a_error_df = pd.DataFrame(a_error).T
a_error_df.columns = [f'{i/1000000:.1f}' for i in reads_list]
# Melt the DataFrame to long format for Seaborn
a_error_df_melted = a_error_df.melt(var_name='Reads', value_name='Error')
# Plot the box plot
plt.figure(figsize=(5.5, 3))
sns.boxplot(x='Reads', y='Error', data=a_error_df_melted)
plt.title('Error vs Reads Downsampling', fontsize=12)
plt.xlabel('Reads (1e6)', fontsize=10)
plt.ylabel('Error ($\mu$m)', fontsize=10)
plt.tick_params(axis='both', which='both',labelsize=8)
plt.ylim(0,200)
plt.tight_layout()
plt.savefig('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2ae_error_reads_downsampling.svg',dpi=300)
plt.show()
Slide-seq data with reconstruction results¶
mRNA library has been run through Slide-seq pipeline and RCTD for cell type decomposition
In [6]:
adata_recon = sc.read_h5ad(os.path.join(sample_folder,'c58_8_recon_RCTD.h5ad'))
Fig1_g¶
In [369]:
bead_umap = pd.DataFrame(adata_recon.obsm['X_umap'], index=adata_recon.obs.index, columns=['UMAP1','UMAP2'])
bead_umap = bead_umap.merge(adata_recon.obs[['spot_class','first_type']], left_index=True, right_index=True)
In [1]:
type12 = ['CA3','Denate','Cajal_Retzius','Interneuron','CA1','Oligodendrocyte','Entorihinal','Microglia_Macrophages'
,'Astrocyte','Choroid','Neuron.Slc17a6','Ependymal']
In [379]:
data1 = bead_umap[(bead_umap['spot_class']=='singlet') & (bead_umap['first_type'].isin(type12))]
data2 = bead_umap[(bead_umap['spot_class']!='singlet') | (-bead_umap['first_type'].isin(type12))]
plt.figure(figsize=(3, 3))
sns.scatterplot(data2, x="UMAP1", y="UMAP2", color='#E3E3E3', s=2, edgecolor=None)
sns.scatterplot(data1, x="UMAP1", y="UMAP2", hue="first_type", s=2, hue_order=type12,
edgecolor=None, palette=sns.color_palette("husl", 12))
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.xlabel('UMAP1', fontsize=7)
plt.ylabel('UMAP2', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2ah_umap_RCTD_12_type.svg'),dpi=300)
# plt.close()
plt.show()
Fig1_h¶
In [371]:
cell_type_bead = adata_recon.obs
bead_loc = pd.DataFrame(adata_recon.obsm['spatial'], index=adata_recon.obs.index, columns=['xcoord','ycoord'])
bead_loc.ycoord = - bead_loc.ycoord
cell_type_bead = cell_type_bead.merge(bead_loc, left_index=True, right_index=True)
In [374]:
## Fig2k
data1 = cell_type_bead[(cell_type_bead['spot_class']=='singlet') & (cell_type_bead['first_type'].isin(type12))]
data2 = cell_type_bead[(cell_type_bead['spot_class']!='singlet') | (-cell_type_bead['first_type'].isin(type12))]
plt.figure(figsize=(3, 3))
sns.scatterplot(data2, x="xcoord", y="ycoord", color='#E3E3E3', s=1, edgecolor=None)
sns.scatterplot(data1, x="xcoord", y="ycoord", hue="first_type", hue_order=type12,
s=1.5, edgecolor=None, palette=sns.color_palette("husl", 12))
# plt.title('Simulated Anchor Location ({})'.format(n_a), fontsize=16, pad=20)
# plt.annotate('simulated anchor location ({})'.format(n_a), xy=(0.5, -0), xycoords='axes fraction', ha='center', va='center', fontsize=16)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.xlabel('X', fontsize=7)
plt.ylabel('Y', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2','Fig2k_RCTD_12_type.svg'),dpi=300)
# plt.close()
plt.show()
FigS3_g¶
In [7]:
adata_truth = sc.read_h5ad(os.path.join(sample_folder,'c58_8_truth_RCTD.h5ad'))
In [381]:
cell_type_bead = adata_truth.obs
bead_loc = pd.DataFrame(adata_truth.obsm['spatial'], index=adata_truth.obs.index, columns=['xcoord','ycoord'])
bead_loc.ycoord = - bead_loc.ycoord
cell_type_bead_truth = cell_type_bead.merge(bead_loc, left_index=True, right_index=True)
In [382]:
data1 = cell_type_bead_truth[(cell_type_bead_truth['spot_class']=='singlet') & (cell_type_bead_truth['first_type'].isin(type12))]
data2 = cell_type_bead_truth[(cell_type_bead_truth['spot_class']!='singlet') | (-cell_type_bead_truth['first_type'].isin(type12))]
plt.figure(figsize=(3, 3))
sns.scatterplot(data2, x="xcoord", y="ycoord", color='#E3E3E3', s=1, edgecolor=None)
sns.scatterplot(data1, x="xcoord", y="ycoord", hue="first_type", hue_order=type12,
s=1.5, edgecolor=None, palette=sns.color_palette("husl", 12))
# plt.title('Simulated Anchor Location ({})'.format(n_a), fontsize=16, pad=20)
# plt.annotate('simulated anchor location ({})'.format(n_a), xy=(0.5, -0), xycoords='axes fraction', ha='center', va='center', fontsize=16)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.xlabel('X', fontsize=7)
plt.ylabel('Y', fontsize=7)
plt.tick_params(axis='both', which='major', labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2l_RCTD_12_type_truth.svg'),dpi=300)
# plt.close()
plt.show()
CA1 width¶
In [ ]:
adata_recon_raw = sc.read_h5ad(os.path.join(sample_folder,'c58_8_recon_raw.h5ad'))
adata_truth_raw = sc.read_h5ad(os.path.join(sample_folder,'c58_8_truth_raw.h5ad'))
FigS1_j¶
In [363]:
sc.settings.figdir = '/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2'
In [365]:
sc.pl.spatial(adata_truth_raw, color='Atp2b1', spot_size=30,
cmap='Blues',frameon=False, save='FigS2h_Atp2b1_truth.svg')
WARNING: saving figure to file /mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2/showFigS2h_Atp2b1_truth.svg
In [366]:
sc.pl.spatial(adata_recon_raw, color='Atp2b1', spot_size=30,
cmap='Blues',frameon=False, save='FigS2h_Atp2b1_recon.svg')
WARNING: saving figure to file /mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2/showFigS2h_Atp2b1_recon.svg
In [13]:
CA1_values = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data_for_paper/slide_map_seq','CA1_width_fiji.csv'))
CA1_values['distance_um'] = (CA1_values.distance_pixel-100) * 3000 / 1030
In [7]:
plt.figure(figsize=(3, 1.5))
sns.lineplot(x=CA1_values['distance_um'].values.flatten()
, y=CA1_values['c58_8_recon'].values.flatten()/CA1_values['c58_8_recon'].values.max(), label='Reconstruction')
sns.lineplot(x=CA1_values['distance_um'].values.flatten() + 10
, y=CA1_values['c58_8_truth'].values.flatten()/CA1_values['c58_8_truth'].values.max(),
label='Ground Truth', linestyle='--')
plt.xlabel('Distance/um', fontsize=7)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.ylabel('Atp2b1 relative count', fontsize=7)
# plt.grid(True, linestyle='--', alpha=0.7)
plt.xlim(-130,130)
plt.tick_params(axis='both', which='both',labelsize=5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2','Fig2h_3_CA_width_comparison.svg'),dpi=300)
plt.show()
FigS3_e¶
In [15]:
fwhm_list = []
for col in CA1_values.columns[1:-1]:
print(col)
x = CA1_values['distance_um'].values.flatten()
y = CA1_values[col].values.flatten()/CA1_values[col].values.max()
peak_value = np.max(y)
# Step 2: Find half maximum
half_max = peak_value / 2
# Step 3: Locate points on the distribution
# np.abs(y - half_max) finds the absolute difference between y values and half max
# np.argmin finds the index of the smallest value in this difference array,
# which will be closest to the half maximum
left_idx = np.argmin(np.abs(y[:len(y)//2] - half_max))
right_idx = np.argmin(np.abs(y[len(y)//2:] - half_max)) + len(y)//2
# Step 4: Calculate FWHM
fwhm= x[right_idx] - x[left_idx]
fwhm_list.append(fwhm)
print("FWHM:", fwhm)
c58_8_truth FWHM: 49.51456310679612 c58_8_recon FWHM: 43.689320388349515 c58_5_truth FWHM: 52.42718446601942 c58_5_recon FWHM: 49.51456310679612 c58_4_truth FWHM: 37.86407766990291 c58_4_recon FWHM: 32.03883495145631
In [24]:
fwhm_pd = pd.DataFrame({'fwhm':fwhm_list,'exp':[i[:5] for i in CA1_values.columns[1:-1]],'type':['truth','recon']*3})
In [60]:
plt.figure(figsize=(2.5, 1.5))
ax=sns.stripplot(data=fwhm_pd, x="type", y="fwhm",hue='exp',s=4,hue_order=['c58_8','c58_4','c58_5'],jitter=False)
plt.plot(np.linspace(-0.2,0.2,2),[np.mean(fwhm_pd[fwhm_pd.type=='truth'].fwhm.values)]*2,c='#B4B4B4')
plt.plot(np.linspace(0.8,1.2,2),[np.mean(fwhm_pd[fwhm_pd.type=='recon'].fwhm.values)]*2,c='#B4B4B4')
plt.xlim(-0.5,1.5)
plt.xlabel('', fontsize=7)
plt.ylabel('CA1 Width', fontsize=7)
plt.tick_params(axis='both', which='both',labelsize=6)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.legend(loc='upper left', frameon=False, bbox_to_anchor=(1, 1), fontsize=7)
plt.tight_layout()
plt.ylim(0,60)
# plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2r_CA1_width_comparison.svg'),dpi=300)
plt.show()
Cell types¶
FigS4_a¶
In [9]:
RCTD_weights = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8','slide_seq_update',
'RCTD_Plots_recon_min20/{}_weights.csv'.format(sample)))
RCTD_weights.columns = ['barcode'] + RCTD_weights.columns[1:].tolist()
RCTD_weights.set_index('barcode', inplace=True)
weights_bead = adata_recon.obs
weights_bead = weights_bead.merge(RCTD_weights, left_index=True, right_index=True)
bead_loc = pd.DataFrame(adata_recon.obsm['spatial'], index=adata_recon.obs.index, columns=['xcoord','ycoord'])
bead_loc.ycoord = - bead_loc.ycoord
weights_bead = weights_bead.merge(bead_loc, left_index=True, right_index=True)
In [15]:
type8 = ['Astrocyte', 'Interneuron', 'Microglia_Macrophages', 'CA1', 'Denate', 'Oligodendrocyte', 'Ependymal', 'CA3']
fig, axs = plt.subplots(2, 4, figsize=(8, 4))
for i, ax in enumerate(axs.flat):
sns.scatterplot(weights_bead, x="xcoord", y="ycoord", hue=type8[i], s=1, edgecolor=None,
palette=sns.color_palette("Blues", as_cmap=True), ax=ax, legend=False)
ax.set_title(type8[i], fontsize=7, pad=10)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('')
ax.set_ylabel('')
cbar_ax = fig.add_axes([0.06, 0.08, 0.12, 0.01]) # x-position, y-position, width, height
# Create the color bar
norm = plt.Normalize(vmin=0, vmax=1) # Assuming your gradient values are between 0 and 1
sm = plt.cm.ScalarMappable(cmap=sns.color_palette("Blues", as_cmap=True), norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, cax=cbar_ax, orientation='horizontal')
cbar.ax.tick_params(labelsize=5)
cbar.set_label('RCTD weight', fontsize=7, labelpad=1)
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2i_RCTD_8_type_weights.png'),dpi=300)
plt.tight_layout()
plt.show()
FigS4_b¶
In [17]:
RCTD_weights_truth = pd.read_csv(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/data/c58_8/c58_8','slide_seq_update',
'RCTD_Plots_truth_min20/{}_weights.csv'.format(sample)))
RCTD_weights_truth.columns = ['barcode'] + RCTD_weights_truth.columns[1:].tolist()
RCTD_weights_truth.set_index('barcode', inplace=True)
weights_bead_truth = adata_truth.obs
weights_bead_truth = weights_bead_truth.merge(RCTD_weights_truth, left_index=True, right_index=True)
bead_loc_truth = pd.DataFrame(adata_truth.obsm['spatial'], index=adata_truth.obs.index, columns=['xcoord','ycoord'])
bead_loc_truth.ycoord = - bead_loc_truth.ycoord
weights_bead_truth = weights_bead_truth.merge(bead_loc_truth, left_index=True, right_index=True)
In [19]:
type8 = ['Astrocyte', 'Interneuron', 'Microglia_Macrophages', 'CA1', 'Denate', 'Oligodendrocyte', 'Ependymal', 'CA3']
fig, axs = plt.subplots(2, 4, figsize=(8, 4))
for i, ax in enumerate(axs.flat):
sns.scatterplot(weights_bead_truth, x="xcoord", y="ycoord", hue=type8[i], s=1, edgecolor=None,
palette=sns.color_palette("Blues", as_cmap=True), ax=ax, legend=False)
ax.set_title(type8[i], fontsize=7, pad=10)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('')
ax.set_ylabel('')
cbar_ax = fig.add_axes([0.06, 0.08, 0.12, 0.01]) # x-position, y-position, width, height
# Create the color bar
norm = plt.Normalize(vmin=0, vmax=1) # Assuming your gradient values are between 0 and 1
sm = plt.cm.ScalarMappable(cmap=sns.color_palette("Blues", as_cmap=True), norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, cax=cbar_ax, orientation='horizontal')
cbar.ax.tick_params(labelsize=5)
cbar.set_label('RCTD weight', fontsize=7, labelpad=1)
plt.title('Ground Truth', fontsize=7, pad=5)
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2','FigS2m_RCTD_8_type_weights_truth.png'),dpi=300)
plt.tight_layout()
plt.show()
Gene expression¶
FigS5_a¶
In [ ]:
adata_recon_raw = sc.read_h5ad(os.path.join(sample_folder,'c58_8_recon_raw.h5ad'))
adata_truth_raw = sc.read_h5ad(os.path.join(sample_folder,'c58_8_truth_raw.h5ad'))
In [72]:
gene8 = ['Plp1', 'Hpca', 'Enpp2', 'Gad1', 'Prox1', 'Sst', 'Atp2b1', 'Slc17a7']
sc.pl.spatial(adata_recon_raw, color=gene8, spot_size=30,
cmap='Blues',frameon=False, save='FigS2k_8_gene.png')
WARNING: saving figure to file /mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/showFigS2k_8_gene.png
FigS5_b¶
In [73]:
gene8 = ['Plp1', 'Hpca', 'Enpp2', 'Gad1', 'Prox1', 'Sst', 'Atp2b1', 'Slc17a7']
sc.pl.spatial(adata_truth_raw, color=gene8, spot_size=30,
cmap='Blues',frameon=False,save='FigS2n_8_gene_truth.png')
WARNING: saving figure to file /mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/showFigS2n_8_gene_truth.png
Nearest Neighbor analysis¶
FigS4g¶
In [385]:
import squidpy as sq
In [383]:
adata_recon_RCTD = adata_recon[adata_recon.obs.spot_class=='singlet']
In [386]:
sq.gr.spatial_neighbors(adata_recon_RCTD, coord_type='generic')
sq.gr.nhood_enrichment(adata_recon_RCTD, cluster_key="first_type")
0%| | 0/1000 [00:00<?, ?/s]
In [387]:
sq.pl.nhood_enrichment(adata_recon_RCTD, cluster_key="first_type", figsize=(5, 5), vmin=-45, vmax=170,
save='/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2t_neighborhood_recon.png')
In [388]:
adata_truth_RCTD = adata_truth[adata_truth.obs.spot_class=='singlet']
In [389]:
sq.gr.spatial_neighbors(adata_truth_RCTD, coord_type='generic')
sq.gr.nhood_enrichment(adata_truth_RCTD, cluster_key="first_type")
0%| | 0/1000 [00:00<?, ?/s]
In [390]:
sq.pl.nhood_enrichment(adata_truth_RCTD, cluster_key="first_type", figsize=(5, 5), vmin=-45, vmax=170,
save='/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2u_neighborhood_truth.png')
Compare recon vs truth matched bead / UMI stats¶
In [10]:
compare_sum = pd.DataFrame({'sample':['data','data','replicate 1','replicate 1','replicate 2','replicate 2','data','data','replicate 1','replicate 1','replicate 2','replicate 2'],
'type':['truth','recon','truth','recon','truth','recon','truth','recon','truth','recon','truth','recon'],
'data_type':['matched_bead','matched_bead','matched_bead','matched_bead','matched_bead','matched_bead','matched_umi','matched_umi','matched_umi','matched_umi','matched_umi','matched_umi'],
'data':[1, len(adata_recon_c58_8.obs_names)/ len(adata_truth_c58_8.obs_names),
1, len(adata_recon_c58_4.obs_names)/ len(adata_truth_c58_4.obs_names),
1, len(adata_recon_c58_5.obs_names)/ len(adata_truth_c58_5.obs_names),
1, np.sum(adata_recon_c58_8.obs['total_counts'])/np.sum(adata_truth_c58_8.obs['total_counts']),
1, np.sum(adata_recon_c58_4.obs['total_counts'])/np.sum(adata_truth_c58_4.obs['total_counts']),
1, np.sum(adata_recon_c58_5.obs['total_counts'])/np.sum(adata_truth_c58_5.obs['total_counts'])]})
In [11]:
bead_mean = np.mean([len(adata_recon_c58_8.obs_names)/ len(adata_truth_c58_8.obs_names),len(adata_recon_c58_4.obs_names)/ len(adata_truth_c58_4.obs_names),len(adata_recon_c58_5.obs_names)/ len(adata_truth_c58_5.obs_names)])
In [12]:
umi_mean = np.mean([np.sum(adata_recon_c58_8.obs['total_counts'])/np.sum(adata_truth_c58_8.obs['total_counts']),
np.sum(adata_recon_c58_4.obs['total_counts'])/np.sum(adata_truth_c58_4.obs['total_counts']),
np.sum(adata_recon_c58_5.obs['total_counts'])/np.sum(adata_truth_c58_5.obs['total_counts']),])
In [20]:
## Fig2p
plt.figure(figsize=(2, 2))
ax=sns.stripplot(data=compare_sum, x="data_type", y="data",hue='type',s=4)
plt.plot(np.linspace(-0.2,0.2,2),[1]*2,c='#1f77b4')
plt.plot(np.linspace(-0.2,0.2,2),[bead_mean]*2,c='orange')
plt.plot(np.linspace(0.8,1.2,2),[1]*2,c='#1f77b4')
plt.plot(np.linspace(0.8,1.2,2),[umi_mean]*2,c='orange')
plt.xlim(-0.5,1.5)
plt.xlabel('', fontsize=7)
plt.ylabel('Normalized by truth', fontsize=7)
plt.tick_params(axis='both', which='both',labelsize=6)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
sns.move_legend(ax, "upper left", bbox_to_anchor=(0, 1))
plt.tight_layout()
plt.ylim(0.8,2)
plt.savefig(os.path.join('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/Fig2','Fig2p_matched_counts_comparison.svg'),dpi=300)
plt.show()
FigS4_h¶
In [5]:
# load gene expression data
adata = sc.read_text(os.path.join(sample_folder, 'Puck_230818_08.all_illumina.digital_expression.txt.gz'), delimiter='\t')
adata = adata.T
In [393]:
# match bc from gene expression to recon / in situ sequencing bc list with hamming distance=1, collapse gene expression barcodes
a_bc_cell_matching_dict,_,_ = barcode_matching(Counter(a_recon[anchor].values), adata.obs_names.tolist(), max_dist=1)
a_recon_id = a_recon.set_index(anchor)
adata_recon = adata[adata.obs_names.isin(a_bc_cell_matching_dict.keys())].copy()
adata_recon_df = pd.DataFrame(adata_recon.X, index=adata_recon.obs_names, columns=adata_recon.var_names)
adata_recon_df['obs_name_column'] = [a_bc_cell_matching_dict[i] for i in list(adata_recon.obs_names)]
adata_recon_df_grouped = adata_recon_df.groupby('obs_name_column').sum()
adata_recon = sc.AnnData(adata_recon_df_grouped)
adata_recon.obs_names = adata_recon_df_grouped.index
adata_recon.obsm["spatial"] = a_recon_id.loc[list(adata_recon.obs_names)][['xcoord','ycoord']].values
a_bc_truth_matching_dict,_,_ = barcode_matching(Counter(pos['barcode'].values), adata.obs_names.tolist(), max_dist=1)
pos_id = pos.set_index('barcode')
adata_truth = adata[adata.obs_names.isin(a_bc_truth_matching_dict.keys())].copy()
adata_truth_df = pd.DataFrame(adata_truth.X, index=adata_truth.obs_names, columns=adata_truth.var_names)
adata_truth_df['obs_name_column'] = [a_bc_truth_matching_dict[i] for i in list(adata_truth.obs_names)]
adata_truth_df_grouped = adata_truth_df.groupby('obs_name_column').sum()
adata_truth = sc.AnnData(adata_truth_df_grouped)
adata_truth.obs_names = adata_truth_df_grouped.index
adata_truth.obsm["spatial"] = pos_id.loc[list(adata_truth.obs_names)][['xcoord','ycoord']].values
In [30]:
adata_recon.write(os.path.join(sample_folder,'c58_8_recon_raw.h5ad'))
adata_truth.write(os.path.join(sample_folder,'c58_8_truth_raw.h5ad'))
In [62]:
print('total recon bc: ', len(a_recon))
print('gex bc matched to recon: ',len(a_bc_cell_matching_dict))
print('matched recon bc: ',len(set(a_bc_cell_matching_dict.values())))
print('total in situ seq bc (including fiducial): ', len(pos))
print('gex bc matched to in situ sequencing: ',len(a_bc_truth_matching_dict))
print('matched in situ seq bc: ',len(set(a_bc_truth_matching_dict.values())))
print('total bc from gex: ', len(adata.obs_names))
print('total umi to recon: ', adata_recon.X.sum())
print('total umi to in situ sequencing: ', adata_truth.X.sum())
total recon bc: 19408 gex bc matched to recon: 63605 matched recon bc: 19401 total in situ seq bc (including fiducial): 71690 gex bc matched to in situ sequencing: 50785 matched in situ seq bc: 21370 total bc from gex: 101252 total umi to recon: 4793752.0 total umi to in situ sequencing: 4033182.0
In [444]:
adata_recon_raw.X.sum()/adata_truth_raw.X.sum()
Out[444]:
1.1885781
FigS4_i¶
In [61]:
## FigS4ac
plt.figure(figsize=(2.5, 2))
sns.violinplot(x="Data", y="total_counts",data=umi_df, alpha=0.6)
plt.xlabel('Slide-seq library matched with', fontsize=8)
plt.ylabel('log10(UMI per bead)', fontsize=8)
# plt.grid(True, linestyle='--', alpha=0.7)
plt.tick_params(axis='both', which='both',labelsize=6)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.yscale('log')
plt.tight_layout()
# plt.ylim(0,2000)
plt.savefig('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2ac_umi_per_bead_violin.svg',dpi=300)
plt.show()
In [411]:
sns.set(style="white")
In [439]:
## FigS4ac
plt.figure(figsize=(2.5, 2))
sns.violinplot(x="Data", y="total_counts",data=umi_df[umi_df.total_counts>np.log10(20)], alpha=0.8)
plt.xlabel('Slide-seq library matched with', fontsize=8)
plt.ylabel('log10(UMI per bead)', fontsize=8)
# plt.grid(True, linestyle='--', alpha=0.7)
plt.tick_params(axis='both', which='both',labelsize=6)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# plt.yscale('log')
plt.tight_layout()
# plt.ylim(0,2000)
plt.savefig('/mnt/thechenlab/Chenlei/spotmapping/fiducial/Figures/FigS2/FigS2ai_umi_per_bead_violin_filtered.svg',dpi=300)
plt.show()
In [79]:
print('recon UMI per bead mean: ',adata_recon.obs.total_counts.mean())
print('truth UMI per bead mean: ',adata_truth.obs.total_counts.mean())
print('recon UMI per bead median: ',adata_recon.obs.total_counts.median())
print('truth UMI per bead median: ',adata_truth.obs.total_counts.median())
recon UMI per bead mean: 247.08788 truth UMI per bead mean: 188.73102 recon UMI per bead median: 145.0 truth UMI per bead median: 97.0
In [446]:
len(umi_df[(umi_df.total_counts>np.log10(20)) & (umi_df.Data=='in situ seq')])
Out[446]:
16484
In [447]:
len(umi_df[(umi_df.total_counts>np.log10(20)) & (umi_df.Data=='recon')])
Out[447]:
18999
In [448]:
len(umi_df[(umi_df.total_counts>np.log10(20)) & (umi_df.Data=='recon')])/len(umi_df[(umi_df.total_counts>np.log10(20)) & (umi_df.Data=='in situ seq')])
Out[448]:
1.1525721912157243
In [ ]: